In [82]:
    
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('bmh')
%matplotlib inline
    
In [83]:
    
# To learn more about the data set https://archive.ics.uci.edu/ml/datasets/pima+indians+diabetes
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
# The raw .data file ships WITHOUT a header row. Reading it with the default
# header=0 would silently consume the first patient record as column names,
# leaving 767 rows instead of 768 and numeric garbage as column labels.
df = pd.read_csv(data_url, header=None)
    
In [84]:
    
df.head()
    
    Out[84]:
In [85]:
    
# Human-readable names for the nine columns of the Pima Indians data set
# (eight features followed by the binary outcome "class").
columns = [
    "#pregnancies",
    "glucose_conc",
    "blood_pressure",
    "skin_thickness",
    "serum_insulin",
    "bmi",
    "dpf",
    "age",
    "class",
]
df.columns = columns
    
In [86]:
    
df.head()
    
    Out[86]:
In [87]:
    
df.shape
    
    Out[87]:
In [88]:
    
# Features are every column except the last; the final column ("class")
# is the binary label we want to predict.
feature_cols = df.columns[:-1]
X = df.loc[:, feature_cols]
y = df.loc[:, df.columns[-1]]
    
In [89]:
    
# Hold out 30% of the rows for evaluation; the fixed seed keeps the
# split (and therefore every score below) reproducible across re-runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)
    
In [90]:
    
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
    
In [91]:
    
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_train_score = lr.score(X_train, y_train)
lr_test_score = lr.score(X_test, y_test)
# Bug fix: the original printed `train_score` / `test_score`, names that are
# never defined anywhere in this notebook, so this cell raised a NameError
# on a fresh Restart & Run All. Print the lr_* variables computed above.
print("Accuracy of training score is", lr_train_score)
print("Accuracy of testing score is", lr_test_score)
    
    
In [92]:
    
# Random Forest
# Seed the forest so re-running the notebook reproduces the same accuracies
# (the train/test split above is already seeded with random_state=42;
# an unseeded forest made the bar chart and grid-search baseline drift
# between runs).
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_train_score = rf.score(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)
print("Accuracy of training score is", rf_train_score)
print("Accuracy of testing score is", rf_test_score)
    
    
In [93]:
    
# Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)
# Score both splits in a single pass over the (data, labels) pairs.
nb_train_score, nb_test_score = (
    nb.score(split_X, split_y)
    for split_X, split_y in ((X_train, y_train), (X_test, y_test))
)
print("Accuracy of training score is", nb_train_score)
print("Accuracy of testing score is", nb_test_score)
    
    
In [94]:
    
# Support Vector Machines
# NOTE(review): the features are fed to the SVM unscaled — presumably
# intentional for a quick baseline, but worth confirming since SVC is
# sensitive to feature scale.
svm = SVC().fit(X_train, y_train)  # .fit returns the fitted estimator itself
svm_train_score = svm.score(X_train, y_train)
svm_test_score = svm.score(X_test, y_test)
print("Accuracy of training score is", svm_train_score)
print("Accuracy of testing score is", svm_test_score)
    
    
In [95]:
    
# Plotting the results: test-set accuracy of each classifier side by side.
classifiers = ["Logistic_Reg", "Random_Forest", "Naive_Bayes", "Support_Vector"]
# Bar positions along the x-axis (the original called this `y_axis`,
# which was misleading — these are horizontal positions, not values).
x_pos = range(len(classifiers))
scores = [lr_test_score, rf_test_score, nb_test_score, svm_test_score]
plt.bar(x_pos, scores, align='center', alpha=0.5)
plt.xticks(x_pos, classifiers)
plt.ylabel('Testing score')
plt.title('Comparison of ML classifiers')
plt.show()  # render the figure and suppress the stray Text(...) repr in Out[]
    
    Out[95]:
    
In [96]:
    
from sklearn.grid_search import GridSearchCV
    
In [103]:
    
# Hyper-parameter grid for the random forest trained above.
grid_values = {
    'n_estimators': (5, 10, 20, 50),    # number of trees in the forest
    'max_depth': (50, 150, 250),        # cap on individual tree depth
    'min_samples_split': [2, 3],        # min samples required to split a node
    'min_samples_leaf': (1, 2, 3),      # min samples required at a leaf
}
# Exhaustive 3-fold cross-validated search over every combination,
# parallelized across all CPU cores.
grid_search = GridSearchCV(estimator=rf, param_grid=grid_values,
                           verbose=1, n_jobs=-1, cv=3)
grid_search.fit(X_train, y_train)
    
    
    
    Out[103]:
In [104]:
    
# Report the best cross-validated score and the winning hyper-parameters.
best_parameters = grid_search.best_estimator_.get_params()
print ('Best score: %0.3f' % grid_search.best_score_)
print ('Best parameters set:')
# Iterating the dict directly yields its keys, same as .keys().
for param_name in sorted(grid_values):
    print ('\t%s: %r' % (param_name, best_parameters[param_name]))
    
    
In [ ]: